package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.select.Elements;

import java.util.regex.Pattern;

/**
 * Crawler that starts at the hexun.com home page, follows links that look like
 * 2022 article URLs and prints the extracted fields of each article page to
 * standard output.
 *
 * <p>NOTE(review): the seed uses {@code https} while the article filter only
 * accepts {@code http} links -- confirm this matches the links the site emits.
 */
public class Hexunwang extends BreadthCrawler {
    /** Entry page the crawl starts from. */
    private static final String SEED_URL = "https://www.hexun.com/";

    /**
     * Filter for links worth following. Compiled once instead of on every
     * anchor; the dots are escaped so they only match a literal '.'.
     */
    private static final Pattern ARTICLE_URL =
            Pattern.compile("http://.*\\.hexun\\.com/2022-.*\\.html");

    /** CSS selector of the byline block holding publication time and author. */
    private static final String BYLINE_SELECTOR = "div.tip.fl";

    /** Text that must appear in the byline for a page to be treated as an article. */
    private static final String BYLINE_MARKER = "和讯";

    /**
     * Number of leading byline characters treated as the publication time.
     * Presumably sized for the site's timestamp format -- TODO confirm.
     */
    private static final int TIME_PREFIX_LENGTH = 20;

    /**
     * Creates a single-threaded crawler seeded with the hexun.com home page.
     *
     * @param crawlPath passed through to {@link BreadthCrawler} as its crawl path
     */
    public Hexunwang(String crawlPath) {
        super(crawlPath, false);
        addSeed(SEED_URL);
        setThreads(1);
        //  setResumable(true);
    }

    /**
     * Handles one fetched page: article pages are printed, every other page is
     * scanned for article links which are queued with the type {@code "text"}.
     *
     * @param page        the fetched page
     * @param crawlDatums collector for follow-up URLs
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        // Only pages queued with type "text" are inspected for a byline, as before.
        String byline = page.matchType("text") ? page.select(BYLINE_SELECTOR).text() : "";
        if (byline.contains(BYLINE_MARKER)) {
            printArticle(page, byline);
        } else {
            queueArticleLinks(page, crawlDatums);
        }
    }

    /** Prints URL, title, author, time and body text of an article page. */
    private void printArticle(Page page, String byline) {
        String title = page.select("div.layout.mg.articleName>h1").text();
        // Clamp the split point: a byline shorter than the time prefix used to
        // throw StringIndexOutOfBoundsException and abort the visit.
        int split = Math.min(TIME_PREFIX_LENGTH, byline.length());
        String time = byline.substring(0, split);
        String writer = byline.substring(split);
        String text = page.select("div.art_contextBox>p").text();

        System.out.println(page.url());
        System.out.println(title);
        System.out.println(writer);

        System.out.println(time);
        System.out.println(text);
    }

    /** Queues every anchor on the page whose absolute URL matches {@link #ARTICLE_URL}. */
    private void queueArticleLinks(Page page, CrawlDatums crawlDatums) {
        Elements anchors = page.select("a");
        for (int i = 0; i < anchors.size(); i++) {
            // Resolve the absolute href once and reuse it for the test and the datum.
            String href = anchors.get(i).attr("abs:href");
            if (ARTICLE_URL.matcher(href).matches()) {
                crawlDatums.add(new CrawlDatum(href, "text"));
            }
        }
    }

    /**
     * Runs the crawler with the crawl path {@code "hexu"} and calls {@code start(2)}.
     *
     * @param args unused
     * @throws Exception propagated from {@code start}
     */
    public static void main(String[] args) throws Exception {
        Hexunwang he = new Hexunwang("hexu");
        he.start(2);
    }
}
